if (rstudioapi::isAvailable()) {
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(conflicted)
conflict_prefer("filter", "dplyr")## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')## [conflicted] Will prefer dplyr::intersect over any other package
all_circ = read_tsv("../data/Supplementary_Table_2_all_circRNAs.txt")## Rows: 1137099 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (16): chr, strand, cell_line, tool, circ_id, circ_id_strand, count_group...
## dbl (7): start, end, BSJ_count, n_detected, n_db, estim_len_in, BSJ_count_m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_circcq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')## Rows: 1560 Columns: 47
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (26): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (21): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sens = read_tsv('../data/Supplementary_Table_6B_sensitivity_values.txt')## Rows: 32 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tool, count_group_median
## dbl (3): nr_detected, nr_expected, sensitivity
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ENST
all_circ %>%
filter(!is.na(ENST), !ENST == 'ambiguous') %>%
select(ENST) %>% unique() %>%
nrow()## [1] 17461
# NA or ambiguous => strand needs to be taken into account
all_circ %>% select(circ_id_strand, ENST) %>% unique() %>%
filter(is.na(ENST) | ENST == 'ambiguous') %>%
count(ENST)with introns
all_circ %>% select(chr, start, end, estim_len_in) %>% unique() %>%
pull(estim_len_in) %>% quantile()## 0% 25% 50% 75% 100%
## 31 454 1660 9925 999911
no introns
all_circ %>% select(chr, start, end, estim_len_ex) %>% unique() %>%
filter(!(is.na(estim_len_ex) | estim_len_ex == 'ambiguous')) %>%
mutate(estim_len_ex = as.numeric(estim_len_ex)) %>%
pull(estim_len_ex) %>% quantile()## 0% 25% 50% 75% 100%
## 1 286 498 917 37221
do not get validated (no introns)
cq %>% filter(qPCR_val == 'pass',
RR_val == 'fail',
amp_seq_val == 'pass') %>%
select(chr, start, end, estim_len_ex) %>% unique() %>%
filter(!(is.na(estim_len_ex) | estim_len_ex == 'ambiguous')) %>%
mutate(estim_len_ex = as.numeric(estim_len_ex)) %>%
pull(estim_len_ex) %>% quantile()## 0% 25% 50% 75% 100%
## 115.00 2109.75 2381.50 3483.00 4758.00
all_circ %>%
filter(!tool == "Sailfish-cir") %>%
pull(BSJ_count) %>%
quantile()## 0% 25% 50% 75% 100%
## 1 1 1 3 4007
how many with BSJ < 5
all_circ %>%
filter(!tool == "Sailfish-cir") %>%
count(count_group)round(945201/(945201+146700), 3)## [1] 0.866
how many with BSJ >= 2
all_circ %>%
filter(!tool == "Sailfish-cir") %>%
filter(BSJ_count >= 2) %>% nrow()## [1] 503237
round(503237/1091901, 3)## [1] 0.461
all_circ %>%
filter(!strand == 'unknown') %>%
select(chr, start, end) %>% unique() %>%
nrow()## [1] 174009
all_circ %>%
filter(!strand == 'unknown') %>%
select(chr, start, end, strand) %>% unique() %>%
nrow()## [1] 190314
round((190314-174009)/174009, 3)## [1] 0.094
sens %>%
filter(!count_group_median == 'count ≥ 5') %>%
filter(!tool == 'circRNA_finder',
!tool == 'segemehl') %>%
pull(sensitivity) %>%
quantile()## 0% 25% 50% 75% 100%
## 0.1868132 0.3736264 0.6565934 0.7362637 0.8736264